{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "from utils import *\n",
    "import time\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20, so\n",
    "# prefer model_selection and only fall back on very old installations.\n",
    "try:\n",
    "    from sklearn.model_selection import train_test_split\n",
    "except ImportError:\n",
    "    from sklearn.cross_validation import train_test_split\n",
    "from unidecode import unidecode\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Positive</td>\n",
       "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Positive</td>\n",
       "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Negative</td>\n",
       "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                               text\n",
       "0  Negative  Lebih-lebih lagi dengan  kemudahan internet da...\n",
       "1  Positive  boleh memberi teguran kepada parti tetapi perl...\n",
       "2  Negative  Adalah membingungkan mengapa masyarakat Cina b...\n",
       "3  Positive  Kami menurunkan defisit daripada 6.7 peratus p...\n",
       "4  Negative        Ini masalahnya. Bukan rakyat, tetapi sistem"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the labelled news sentences (columns: label, text).\n",
    "df = pd.read_csv('sentiment-news-bahasa-v5.csv')\n",
    "\n",
    "# Integer-encode the string labels of the news corpus.\n",
    "Y = LabelEncoder().fit_transform(df['label'])\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textcleaning(string):\n",
    "    \"\"\"Normalise one raw sentence into lowercase, space-separated word tokens.\n",
    "\n",
    "    Whitespace-delimited tokens containing '#' or '@' are dropped, http/www\n",
    "    links are removed, the text is transliterated with unidecode, every\n",
    "    character other than letters, quotes, hyphens and spaces becomes a space,\n",
    "    and only tokens longer than one character are kept.\n",
    "    \"\"\"\n",
    "    kept = [tok for tok in string.split() if '#' not in tok and '@' not in tok]\n",
    "    # Raw strings keep the regex escapes valid Python; the dot after 'www' is\n",
    "    # escaped so only a literal 'www.' prefix is treated as a link.\n",
    "    string = re.sub(r'http\\S+|www\\.\\S+', '', ' '.join(kept))\n",
    "    string = unidecode(string).replace('.', '. ').replace(',', ', ')\n",
    "    string = re.sub(r\"[^'\\\"A-Za-z\\- ]+\", ' ', string)\n",
    "    # Hyphens and double quotes that survive the filter above never belong to\n",
    "    # a token, and one-character tokens are discarded.\n",
    "    tokens = re.findall(r\"[\\w']+\", string)\n",
    "    return ' '.join(tok for tok in tokens if len(tok) > 1).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the text column (position 1) in one pass instead of assigning a\n",
    "# single cell at a time through iloc.\n",
    "df.iloc[:, 1] = df.iloc[:, 1].apply(textcleaning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _read_nonblank_lines(path):\n",
    "    \"\"\"Read `path` and return its lines, skipping empty or whitespace-only ones.\"\"\"\n",
    "    with open(path, 'r') as fopen:\n",
    "        # A trailing newline would otherwise produce an empty document that\n",
    "        # still receives a label.\n",
    "        return [line for line in fopen.read().split('\\n') if line.strip()]\n",
    "\n",
    "negative_texts = _read_nonblank_lines('polarity-negative-translated.txt')\n",
    "positive_texts = _read_nonblank_lines('polarity-positive-translated.txt')\n",
    "\n",
    "# Polarity files first (0 = negative, 1 = positive), then the news corpus.\n",
    "# NOTE(review): assumes the codes in Y use the same 0/1 meaning - confirm\n",
    "# that df.label only contains Negative/Positive.\n",
    "texts = negative_texts + positive_texts + df.iloc[:, 1].tolist()\n",
    "labels = [0] * len(negative_texts) + [1] * len(positive_texts) + Y.tolist()\n",
    "\n",
    "assert len(labels) == len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import xgboost as xgb\n",
    "from malaya.text_functions import STOPWORDS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14279, 45344)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encode the combined label list into a numpy array of class ids.\n",
    "target = LabelEncoder().fit_transform(labels)\n",
    "\n",
    "# Uni- to tri-gram TF-IDF features, keeping terms seen in at least two documents.\n",
    "tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2)\n",
    "vectors = tfidf.fit_transform(texts)\n",
    "# NOTE(review): the vectorizer is fitted on the whole corpus before the\n",
    "# train/test split, so test documents influence the vocabulary and IDF weights.\n",
    "vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix the seed so the split, and every metric derived from it, is reproducible.\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(\n",
    "    vectors, target, test_size=0.2, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation-mlogloss:0.688333\n",
      "Will train until validation-mlogloss hasn't improved in 100 rounds.\n",
      "[5]\tvalidation-mlogloss:0.667841\n",
      "[10]\tvalidation-mlogloss:0.654925\n",
      "[15]\tvalidation-mlogloss:0.645515\n",
      "[20]\tvalidation-mlogloss:0.638027\n",
      "[25]\tvalidation-mlogloss:0.632308\n",
      "[30]\tvalidation-mlogloss:0.627757\n",
      "[35]\tvalidation-mlogloss:0.624292\n",
      "[40]\tvalidation-mlogloss:0.620937\n",
      "[45]\tvalidation-mlogloss:0.618054\n",
      "[50]\tvalidation-mlogloss:0.61473\n",
      "[55]\tvalidation-mlogloss:0.612535\n",
      "[60]\tvalidation-mlogloss:0.610556\n",
      "[65]\tvalidation-mlogloss:0.6082\n",
      "[70]\tvalidation-mlogloss:0.606007\n",
      "[75]\tvalidation-mlogloss:0.603937\n",
      "[80]\tvalidation-mlogloss:0.601853\n",
      "[85]\tvalidation-mlogloss:0.600333\n",
      "[90]\tvalidation-mlogloss:0.598523\n",
      "[95]\tvalidation-mlogloss:0.597057\n",
      "[100]\tvalidation-mlogloss:0.596089\n",
      "[105]\tvalidation-mlogloss:0.594734\n",
      "[110]\tvalidation-mlogloss:0.593502\n",
      "[115]\tvalidation-mlogloss:0.592232\n",
      "[120]\tvalidation-mlogloss:0.590867\n",
      "[125]\tvalidation-mlogloss:0.589868\n",
      "[130]\tvalidation-mlogloss:0.58882\n",
      "[135]\tvalidation-mlogloss:0.587663\n",
      "[140]\tvalidation-mlogloss:0.5864\n",
      "[145]\tvalidation-mlogloss:0.585439\n",
      "[150]\tvalidation-mlogloss:0.584447\n",
      "[155]\tvalidation-mlogloss:0.583611\n",
      "[160]\tvalidation-mlogloss:0.582862\n",
      "[165]\tvalidation-mlogloss:0.581904\n",
      "[170]\tvalidation-mlogloss:0.580833\n",
      "[175]\tvalidation-mlogloss:0.579819\n",
      "[180]\tvalidation-mlogloss:0.578625\n",
      "[185]\tvalidation-mlogloss:0.577797\n",
      "[190]\tvalidation-mlogloss:0.576951\n",
      "[195]\tvalidation-mlogloss:0.576994\n",
      "[200]\tvalidation-mlogloss:0.576557\n",
      "[205]\tvalidation-mlogloss:0.575801\n",
      "[210]\tvalidation-mlogloss:0.575473\n",
      "[215]\tvalidation-mlogloss:0.574753\n",
      "[220]\tvalidation-mlogloss:0.574182\n",
      "[225]\tvalidation-mlogloss:0.573918\n",
      "[230]\tvalidation-mlogloss:0.57349\n",
      "[235]\tvalidation-mlogloss:0.573161\n",
      "[240]\tvalidation-mlogloss:0.572811\n",
      "[245]\tvalidation-mlogloss:0.572279\n",
      "[250]\tvalidation-mlogloss:0.572197\n",
      "[255]\tvalidation-mlogloss:0.571704\n",
      "[260]\tvalidation-mlogloss:0.571143\n",
      "[265]\tvalidation-mlogloss:0.570941\n",
      "[270]\tvalidation-mlogloss:0.570849\n",
      "[275]\tvalidation-mlogloss:0.570425\n",
      "[280]\tvalidation-mlogloss:0.570263\n",
      "[285]\tvalidation-mlogloss:0.56994\n",
      "[290]\tvalidation-mlogloss:0.569922\n",
      "[295]\tvalidation-mlogloss:0.569903\n",
      "[300]\tvalidation-mlogloss:0.569546\n",
      "[305]\tvalidation-mlogloss:0.569218\n",
      "[310]\tvalidation-mlogloss:0.569078\n",
      "[315]\tvalidation-mlogloss:0.568178\n",
      "[320]\tvalidation-mlogloss:0.567893\n",
      "[325]\tvalidation-mlogloss:0.567415\n",
      "[330]\tvalidation-mlogloss:0.567254\n",
      "[335]\tvalidation-mlogloss:0.567432\n",
      "[340]\tvalidation-mlogloss:0.566868\n",
      "[345]\tvalidation-mlogloss:0.566662\n",
      "[350]\tvalidation-mlogloss:0.566777\n",
      "[355]\tvalidation-mlogloss:0.566416\n",
      "[360]\tvalidation-mlogloss:0.566186\n",
      "[365]\tvalidation-mlogloss:0.566218\n",
      "[370]\tvalidation-mlogloss:0.566141\n",
      "[375]\tvalidation-mlogloss:0.565985\n",
      "[380]\tvalidation-mlogloss:0.566031\n",
      "[385]\tvalidation-mlogloss:0.565692\n",
      "[390]\tvalidation-mlogloss:0.565616\n",
      "[395]\tvalidation-mlogloss:0.565521\n",
      "[400]\tvalidation-mlogloss:0.565091\n",
      "[405]\tvalidation-mlogloss:0.564603\n",
      "[410]\tvalidation-mlogloss:0.564504\n",
      "[415]\tvalidation-mlogloss:0.564582\n",
      "[420]\tvalidation-mlogloss:0.564436\n",
      "[425]\tvalidation-mlogloss:0.564468\n",
      "[430]\tvalidation-mlogloss:0.564648\n",
      "[435]\tvalidation-mlogloss:0.564458\n",
      "[440]\tvalidation-mlogloss:0.564489\n",
      "[445]\tvalidation-mlogloss:0.564478\n",
      "[450]\tvalidation-mlogloss:0.56411\n",
      "[455]\tvalidation-mlogloss:0.563945\n",
      "[460]\tvalidation-mlogloss:0.563942\n",
      "[465]\tvalidation-mlogloss:0.564031\n",
      "[470]\tvalidation-mlogloss:0.564233\n",
      "[475]\tvalidation-mlogloss:0.564116\n",
      "[480]\tvalidation-mlogloss:0.564342\n",
      "[485]\tvalidation-mlogloss:0.564267\n",
      "[490]\tvalidation-mlogloss:0.564168\n",
      "[495]\tvalidation-mlogloss:0.564042\n",
      "[500]\tvalidation-mlogloss:0.56404\n",
      "[505]\tvalidation-mlogloss:0.56411\n",
      "[510]\tvalidation-mlogloss:0.563783\n",
      "[515]\tvalidation-mlogloss:0.564187\n",
      "[520]\tvalidation-mlogloss:0.564173\n",
      "[525]\tvalidation-mlogloss:0.564342\n",
      "[530]\tvalidation-mlogloss:0.564231\n",
      "[535]\tvalidation-mlogloss:0.564459\n",
      "[540]\tvalidation-mlogloss:0.564475\n",
      "[545]\tvalidation-mlogloss:0.564458\n",
      "[550]\tvalidation-mlogloss:0.564581\n",
      "[555]\tvalidation-mlogloss:0.56459\n",
      "Stopping. Best iteration:\n",
      "[459]\tvalidation-mlogloss:0.563744\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Early stopping needs a monitoring set. Take it from the training data so\n",
    "# the test split stays untouched until the final evaluation; monitoring on\n",
    "# the test split would make the reported scores optimistic.\n",
    "fit_X, valid_X, fit_Y, valid_Y = train_test_split(\n",
    "    train_X, train_Y, test_size=0.1, random_state=42\n",
    ")\n",
    "train_d = xgb.DMatrix(fit_X, fit_Y)\n",
    "valid_d = xgb.DMatrix(valid_X, valid_Y)\n",
    "test_d = xgb.DMatrix(test_X, test_Y)\n",
    "params_xgd = {\n",
    "    'min_child_weight': 10.0,\n",
    "    'max_depth': 7,\n",
    "    'objective': 'multi:softprob',\n",
    "    'max_delta_step': 1.8,\n",
    "    'num_class': 2,\n",
    "    'colsample_bytree': 0.4,\n",
    "    'subsample': 0.8,\n",
    "    'learning_rate': 0.1,\n",
    "    'gamma': 0.65,\n",
    "    'silent': True,\n",
    "    'eval_metric': 'mlogloss'\n",
    "}\n",
    "# Up to 10000 boosting rounds; stop once the validation mlogloss has not\n",
    "# improved for 100 rounds, logging every 5th round.\n",
    "model = xgb.train(params_xgd, train_d, 10000, evals=[(valid_d, 'validation')],\n",
    "                  early_stopping_rounds=100, verbose_eval=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.70      0.64      0.67      1332\n",
      "   positive       0.71      0.76      0.73      1524\n",
      "\n",
      "avg / total       0.71      0.71      0.70      2856\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Per-class scores from the best early-stopped iteration; argmax over axis 1\n",
    "# picks the predicted class id for each test sample.\n",
    "test_probs = model.predict(xgb.DMatrix(test_X), ntree_limit=model.best_ntree_limit)\n",
    "predicted = np.argmax(test_probs, axis=1)\n",
    "\n",
    "report = metrics.classification_report(test_Y, predicted, target_names=['negative', 'positive'])\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the trained booster to disk.\n",
    "with open('xgboost-sentiment.pkl', 'wb') as model_file:\n",
    "    pickle.dump(model, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the fitted TF-IDF vectorizer alongside the model.\n",
    "with open('tfidf-xgboost-sentiment.pkl', 'wb') as tfidf_file:\n",
    "    pickle.dump(tfidf, tfidf_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
